# check_format.py
# Path of the token-ID text file to inspect. It is relative, so it is
# resolved against the current working directory when the script runs.
DATA_FILE = "../data/owt_tokens_IDs"

# Preview: print the first five lines of the file together with the number
# of whitespace-separated tokens found on each line.
print("🔍 查看 owt_tokens_IDs 前 5 行：\n")
count = 0
with open(DATA_FILE, 'r', encoding='utf-8') as handle:
    for raw_line in handle:
        # Stop once five lines have been shown; the rest is not needed here.
        if count == 5:
            break
        count += 1
        stripped = raw_line.strip()
        token_total = len(stripped.split())
        # repr() form makes hidden characters (tabs, stray escapes) visible.
        print(f"第 {count} 行: {stripped!r}")
        print(f"  长度: {token_total} 个 token\n")

# Count the total number of lines (optional: this reads the whole file, so
# it is slow for large files).
print("📊 正在统计总行数...")
# Open the file in a context manager so the handle is closed as soon as the
# count is done, rather than being left open until garbage collection.
with open(DATA_FILE, 'r', encoding='utf-8') as line_source:
    # Stream line by line; only a running count is kept in memory.
    total_lines = sum(1 for _ in line_source)
print(f"✅ 共 {total_lines:,} 行")